#############################################
# Microarray Basics
#############################################
library(affy)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Warning in read.dcf(con): URL 'http://bioconductor.org/BiocInstaller.dcf':
## status was 'Couldn't resolve host name'
library(affydata)
## Package LibPath Item
## [1,] "affydata" "/Users/dancikg/Library/R/3.4/library" "Dilution"
## Title
## [1,] "AffyBatch instance Dilution"
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
############################################################
# Look at the Dilution dataset: this contains 4 liver
# tissue hybridized in concentrationsof 10 and 20 micrograms
# to scanner 1(A) and scanner 2(B). Note that Dilution is an
# object of type AffyBatch (or an extension of an
# ExpressionSet (eSet) object)
######################################################
data(Dilution)
Dilution
## Warning: replacing previous import 'AnnotationDbi::tail' by 'utils::tail'
## when loading 'hgu95av2cdf'
## Warning: replacing previous import 'AnnotationDbi::head' by 'utils::head'
## when loading 'hgu95av2cdf'
##
## AffyBatch object
## size of arrays=640x640 features (35221 kb)
## cdf=HG_U95Av2 (12625 affyids)
## number of samples=4
## number of genes=12625
## annotation=hgu95av2
## notes=
####################################################
# Methods (functions) for AffyBatch objects
####################################################
sampleNames(Dilution) # the names of the samples
## [1] "20A" "20B" "10A" "10B"
experimentData(Dilution) # experiment information
## Experiment data
## Experimenter name: Gene Logic
## Laboratory: Gene Logic
## Contact information: 708 Quince Orchard Road
## Gaithersburg, MD 20878
## Telephone: 1.301.987.1700
## Toll Free: 1.800.GENELOGIC (US and Canada)
## Facsimile: 1.301.987.1701
##
## Title: Small part of dilution study
## URL: http://qolotus02.genelogic.com/datasets.nsf/
## PMIDs:
##
## Abstract: A 68 word abstract is available. Use 'abstract' method.
## notes:
## :
##
annotation(Dilution) # annotation (the microarray used)
## [1] "hgu95av2"
phenoData(Dilution) # get phenotypic (clinical) data
## An object of class 'AnnotatedDataFrame'
## sampleNames: 20A 20B 10A 10B
## varLabels: liver sn19 scanner
## varMetadata: labelDescription
pData(Dilution) # phenotypic data in table form
## liver sn19 scanner
## 20A 20 0 1
## 20B 20 0 2
## 10A 10 0 1
## 10B 10 0 2
varMetadata(Dilution) # description of phenotypic data
## labelDescription
## liver amount of liver RNA hybridized to array in micrograms
## sn19 amount of central nervous system RNA hybridized to array in micrograms
## scanner ID number of scanner used
####################################################
# Let's look at each microarray
####################################################
image(Dilution, col = heat.colors(500))




####################################################
# Produces a boxplot of log base 2 intensities #
# Does it seem fair to compare gene expression
# across concentrations or scanners?
####################################################
boxplot(Dilution, col = 1:4, ylab = "log2 expression")

####################################################
# Normalization of microarray data involves
# 1. Background correction to remove noise
# 2. Normalization of samples
# 3. Estimation of "average" probe intensity values
####################################################
################################################################
# We will use Robust Multi-array average (RMA) which involves
# 1. Background correction of probe level intensity values
# 2. Quantile Normalization
# 3. Estimation of average probe set values on log2 scale
################################################################
Dilution.rma <- rma(Dilution) # perform RMA
## Background correcting
## Normalizing
## Calculating Expression
Dilution.expr <- exprs(Dilution.rma) # extract the expression values
################################################################
# confirm that samples are quantile normalized
################################################################
boxplot(Dilution.expr, col = 2:5, ylab = "log2 expression",
main = "Small part of dilution study (after RMA normalization)",
xlab = "Sample")

################################################################
# Let's look at a leukemiadataset, and compare
# acute lymphoblastic leukemia (ALL) samples with healthy,
# non-leukemia (noL) bone marrow samples
################################################################
library(leukemiasEset)
data(leukemiasEset)
################################################################
# Extract phenotype data. How many samples of each leukemia
# type is there?
################################################################
leukemia.p <- pData(leukemiasEset)
table(leukemia.p$LeukemiaType)
##
## ALL AML CLL CML NoL
## 12 12 12 12 12
################################################################
# This data is already processed using RMA, so we can extract
# the expression data
################################################################
leukemia.expr <- exprs(leukemiasEset)
# confirm that data has been processed #
boxplot(leukemia.expr, main = "Leukemia samples", ylab = "log2 expression")

######################################################################
# Let's compare expression between ALL and healthy bone marrow
# samples for for the following probes:
# ENSG00000171960 - corresponds to gene PPIH
# (https://www.ncbi.nlm.nih.gov/gene/10465)
# ENSG00000135679 - corresponds to gene MDM2
# (https://www.ncbi.nlm.nih.gov/gene/4193)
# We want to calculuate the fold change (see below) and the p-value
# evaluating whether or not any difference in expression between
# ALL and healthy samples are statistically significant
# (in other words, are the genes differentially expressed?)
######################################################################
# find expression for desired probe
m <- match("ENSG00000171960", rownames(leukemia.expr))
m
## [1] 12857
df <- data.frame(expr = leukemia.expr[m,], type = leukemia.p$LeukemiaType)
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
theme_classic() + theme(legend.position = "none") +
labs(x = "Leukemia Type", y = "Log2 Expression") +
ggtitle("PPIH (ENSG00000171960) expression")

# let's only look at ALL and NoL
df <- filter(df, type == "ALL" | type == "NoL")
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
theme_classic() + theme(legend.position = "none") +
labs(x = "Leukemia Type", y = "Log2 Expression") +
ggtitle("PPIH (ENSG00000171960) expression")

########################################################################
# Calculate fold change (FC) which is average expression in the
# first group divided by the average expression in the second group,
# Since data is on the log2 scale, we must convert back to normal scale
########################################################################
# returns a list of values with elements of the first vector split by
# elements of the second vector; if the second argument is a factor
# and drop is TRUE, we drop levels that do not occur
s <- split(df$expr, df$type, drop = TRUE)
l <- lapply(s, mean)
logFC <- l$ALL - l$NoL
FC <- 2**logFC ## data is on log2 scale
main = paste("PPIH (ENSG00000171960) expression\nFC = ", round(FC,2))
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
theme_classic() + theme(legend.position = "none") +
labs(x = "Leukemia Type", y = "Log2 Expression") +
ggtitle(main)

########################################################################
# Is the difference in fold change statistically significant??
# H0: mu_ALL - mu_normal = 0
# HA: mu_ALL - mu_normal != 0
# where mu_ALL is the mean expression of ALL samples for the
# probe of interest and mu_normal is the mean expression of
# normal samples
# How do we carry out a hypothesis test comparing two population means?
# (we will do this in class)
########################################################################
res <- t.test(s$ALL, s$NoL)
res
##
## Welch Two Sample t-test
##
## data: s$ALL and s$NoL
## t = 0.33335, df = 17.201, p-value = 0.7429
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2543382 0.3498908
## sample estimates:
## mean of x mean of y
## 8.981066 8.933290
res$p.value
## [1] 0.7429002
# we fail to reject the null. There is not sufficient
# evidence that the gene PPIH is differentially expressed
# between ALL and normal samples
########################################################################
# Repeat analysis for probe ENSG00000135679 (MDM2)
########################################################################
# get row index of probe, then create data frame, including only
# ALL and NoL samples
m <- match("ENSG00000135679", rownames(leukemia.expr))
df <- data.frame(expr = leukemia.expr[m,], type = leukemia.p$LeukemiaType)
df <- filter(df, type == "ALL" | type == "NoL")
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
theme_classic() + theme(legend.position = "none") +
labs(x = "Leukemia Type", y = "Log2 Expression") +
ggtitle("MDM2 (ENSG00000135679) expression")

# Is there evidence that MDM2 is differentially expressed between
# ALL and normal samples? (we will complete this in class)
s <- split(df$expr, df$type, drop = TRUE)
l <- lapply(s, mean)
logFC <- l$ALL - l$NoL
FC <- 2**logFC ## data is on log2 scale
res <- t.test(s$ALL, s$NoL)
res$p.value
## [1] 0.004616965
# because P < 0.05, we reject H0 and conclude that
# MDM2 is differentially expressed between ALL
# and NoL samples